# -*- coding: utf-8 -*-
## Temporarily used; only segments a subset of the data.
import codecs
import json
import re
import os
import jieba
import string
import io
import zhon.hanzi
import csv

## initialize silently
import logging
jieba.setLogLevel(logging.ERROR)

# Select the main dictionary BEFORE initializing: set_dictionary() marks the
# tokenizer as uninitialized again, so calling initialize() first only loads
# the default dictionary to have it discarded and rebuilt on the first
# add_word() call.
jieba.set_dictionary('../supporting/jieba.dict.big.txt')
jieba.initialize()

# Extra vocabulary registered with jieba at import time. Unicode literals give
# the same values as the former '...'.decode('utf-8') calls (the file declares
# a utf-8 source encoding) without depending on byte-string literals.
_CUSTOM_WORDS = (
    u'开发商',
    u'不言而喻',
    u'依山傍水',
    u'烂人',
    u'强拆',
    u'不予',
    u'还钱',
    u'流失',
    u'手无缚鸡之力',
    u'人肉',
    u'征地',
    u'肇事',
    u'黑社会',
    u'过渡费',
    u'正能量',
    u'青红皂白',
    u'老板',
    u'煤矿',
    u'聚众闹事',
    u'的士',
    u'肢体冲突',
    u'上访',
    u'希望',
    u'打人',
    u'征用',
    u'示威',
    u'抗议',
    u'维权',
    u'沖绳',
)
for _word in _CUSTOM_WORDS:
    jieba.add_word(_word)


# One list entry per character: ASCII punctuation, ASCII digits and the
# punctuation set exported by zhon.hanzi.
pp = string.punctuation + string.digits + zhon.hanzi.punctuation
pp = list(pp)

# Only the first line of the stopword file is used; it is split on commas.
# The context manager closes the handle, which the previous one-liner leaked.
with codecs.open("../supporting/stopwords1.txt", 'r', encoding='utf-8') as _stopword_file:
    stopwords = _stopword_file.readline()
stopwords = stopwords.strip().split(",")
stopwords = list(set(stopwords))

# `stop` maps every punctuation/digit character and every stopword to 0; it is
# a dict so that membership tests are constant-time.
stop = pp + stopwords
stop = {x: 0 for x in stop}

	
# Matches every character outside the range U+4E00-U+9FA5; compiled once
# instead of on every segmenter() call.
_NON_CHINESE = re.compile(u'[^\u4E00-\u9FA5]')

# (byte string, text string) on both Python 2 and Python 3.
_STRING_TYPES = (bytes, type(u""))

# Historical, machine-specific location of the keep-word list. Kept as the
# default so existing callers behave as before wherever that file exists.
_DEFAULT_KEEPWORDS_PATH = '/Users/han/Codes/Jen Pan/keyword_generation/high_frequency_stopwords.txt'

# Paths already handed to _load_keepwords(), so each file is read only once.
_loaded_keepword_files = set()


def _load_keepwords(path):
    """Register each non-blank line of *path* with jieba, once per path."""
    if not path or path in _loaded_keepword_files:
        return
    # Remember the attempt even when it fails so the warning is printed once.
    _loaded_keepword_files.add(path)
    try:
        with codecs.open(path, 'r', encoding="utf-8") as handle:
            words = [line.strip() for line in handle]
    except IOError as e:
        print("segmenter: cannot read keep-word file %s (%s); continuing without it" % (path, e))
        return
    for word in words:
        if word:
            jieba.add_word(word)


def _segment_text(text):
    """Return the jieba tokens of *text* after blanking non-Chinese characters."""
    if isinstance(text, bytes):
        # The regex below is a Unicode pattern; applied to raw UTF-8 bytes it
        # would blank out every byte, so decode first.
        text = text.decode('utf-8', 'ignore')
    cleaned = _NON_CHINESE.sub(u' ', text)
    return [token for token in jieba.cut(cleaned, cut_all=False) if token != u' ']


def segmenter(inputdata, delim=",", position=0, labelposition=None, output_to_file=True, writedelim="\t", outputfile_name="outfile.txt", keepwords_path=_DEFAULT_KEEPWORDS_PATH):
    """Segment Chinese text with jieba.

    Every character outside U+4E00-U+9FA5 is replaced by a space before
    segmentation, and tokens consisting of a single space are dropped.

    Args:
        inputdata (str or list): input

            * if input is a string (text or UTF-8 bytes), it is the text to
              segment. It is NOT treated as a filename.
            * if input is a list (or tuple), each element is one text.

        delim (str): currently unused; kept for backward compatibility.
        position (int): currently unused; kept for backward compatibility.
        labelposition (int): currently unused; kept for backward compatibility.
        output_to_file (bool): only consulted for list input.

            * If true, write one line per text (tokens joined by single
              spaces) to ``outputfile_name``, encoded as UTF-8.
            * Else, return a list with one token list per text.

        writedelim (str): currently unused; kept for backward compatibility.
        outputfile_name (str): path of the file written when
            ``output_to_file`` is true.
        keepwords_path (str): file with one word per line to register with
            jieba before segmenting. Read at most once per path; a missing or
            unreadable file is reported and skipped. Pass ``None`` to skip it.

    Returns:
        list or None:
            * string input: list of tokens.
            * list input, ``output_to_file`` false: list of token lists. A
              text that fails to segment is reported on stdout and represented
              by ``""`` so positions stay aligned with the input.
            * list input, ``output_to_file`` true: None. Texts that fail to
              segment are reported on stdout and produce no output line.
            * any other input type: None.

    """
    _load_keepwords(keepwords_path)

    if isinstance(inputdata, _STRING_TYPES):
        return _segment_text(inputdata)

    if not isinstance(inputdata, (list, tuple)):
        return None

    if not output_to_file:
        segmented = []
        for text in inputdata:
            try:
                segmented.append(_segment_text(text))
            except Exception as e:
                # Best effort per item: report and keep a placeholder.
                print("segmentation error  %s" % (e,))
                segmented.append("")
        return segmented

    print(outputfile_name)
    # The context manager guarantees the output file is closed even if an
    # unexpected error escapes the loop.
    with codecs.open(outputfile_name, 'w', encoding='utf-8') as out:
        for text in inputdata:
            try:
                tokens = _segment_text(text)
            except Exception as e:
                print("segmentation error  %s" % (e,))
                continue
            out.write(u" ".join(tokens).strip() + u"\n")
    return None



		




